import pymysql
import requests
from bs4 import BeautifulSoup

from db_helper import DbHelper


def get_html(url):
    """Fetch *url* and return the response body as text.

    Returns:
        str: the page HTML on success.
        None: on any request failure (connection error, timeout, or an
        HTTP error status raised by ``raise_for_status``).  The original
        returned the sentinel ``-1``; ``None`` is falsy and therefore a
        natural failure check, and no caller compared against ``-1``.
    """
    try:
        r = requests.get(url, timeout=5)
        r.raise_for_status()  # turn 4xx/5xx responses into an exception
        return r.text
    except requests.RequestException as e:
        # Narrowed from a bare `except Exception`; the message no longer
        # claims "type1" for every caller.
        print('get html error:', e)
        return None


def get_category(url):
    """Crawl the Sogou pinyin dictionary category tree starting at *url*.

    Walks the top-level nav (``#dict_nav_list``), then each second-level
    category page, collecting for every leaf category its listing URL and
    page count.

    Returns:
        list[dict]: one dict per second-level category with keys
        ``url`` (listing base URL ending in ``/default/``), ``page``
        (page count, str or int), ``cate1`` (top-level name from the
        hard-coded Chinese label list) and ``cate2`` (scraped name).
    """
    cate1s = ['城市信息','自然科学','社会科学','工程应用','农林渔畜','医学医药','电子游戏','艺术设计','生活百科','运动休闲','人文科学','娱乐休闲']
    res = []
    html1 = get_html(url)
    # get_html signals failure with a non-str sentinel; without the top
    # page there is nothing to crawl.
    if not isinstance(html1, str):
        return res
    soup1 = BeautifulSoup(html1, 'lxml')
    nav = soup1.find('div', {'id': 'dict_nav_list'})
    # Pair each nav anchor with its label instead of indexing by position;
    # zip also tolerates a length mismatch between page and label list.
    for cate1_name, anchor in zip(cate1s, nav.find_all('a')):
        html2 = get_html('https://pinyin.sogou.com' + anchor['href'])
        if not isinstance(html2, str):
            continue  # skip an unreachable category page instead of crashing
        soup2 = BeautifulSoup(html2, 'lxml')
        cate2_list = (soup2.find_all('div', {'class': 'cate_no_child no_select'})
                      + soup2.find_all('div', {'class': 'cate_has_child no_select'}))
        for cate2 in cate2_list:
            link = 'https://pinyin.sogou.com' + cate2.find('a')['href'] + '/default/'

            html3 = get_html(link)
            if not isinstance(html3, str):
                continue
            soup3 = BeautifulSoup(html3, 'lxml')
            page_list = soup3.find('div', {'id': 'dict_page_list'})
            li_list = page_list.find_all('li')
            try:
                # Second-to-last pager entry holds the last page number.
                page_num = li_list[-2].text
            except IndexError:
                # Single-page category: the pager is too short to index.
                page_num = 1
            res.append({'url': link, 'page': page_num, 'cate1': cate1_name,
                        'cate2': cate2.text.strip().replace('"', '')})
    return res

def get_download(url):
    """Scrape one dictionary listing page.

    Args:
        url: full listing-page URL (category base + page number).

    Returns:
        tuple[list[str], list[str]]: parallel lists of dictionary titles
        (from ``.detail_title`` anchors) and download URLs (from
        ``.dict_dl_btn`` anchors).  Both lists are empty when the page
        could not be fetched.
    """
    html = get_html(url)
    # get_html signals failure with a non-str sentinel.
    if not isinstance(html, str):
        return [], []
    soup = BeautifulSoup(html, 'lxml')
    titles = [div.find('a').text
              for div in soup.find_all('div', {'class': 'detail_title'})]
    download_urls = [div.find('a')['href']
                     for div in soup.find_all('div', {'class': 'dict_dl_btn'})]
    return titles, download_urls

if __name__ == '__main__':
    # Walk every category, every listing page, and print one line per
    # downloadable dictionary (URL, synthesized filename, category path).
    cates = get_category('https://pinyin.sogou.com/dict/cate/index/167')
    for cate in cates:
        for page in range(1, int(cate['page']) + 1):
            page_url = cate['url'] + str(page)
            titles, download_urls = get_download(page_url)
            # titles and download_urls are parallel lists from the same page.
            for title, download_url in zip(titles, download_urls):
                filename = '{}_{}_{}'.format(cate['cate1'], cate['cate2'], title)
                print('url:{}\tfilename:{}\tcate1:{}\tcate2:{}'.format(download_url, filename, cate['cate1'], cate['cate2']))
    # configs = {'host': '127.0.0.1', 'user': 'root', 'password': 'admin', 'db': 'sogou'}
    # db = DbHelper().connenct(configs)
